////////////////////////////////////////////////////////////////
//
//      CFA line denoise by DCT filtering
//
//  copyright (c) 2008-2010  Emil Martinec <ejmartin@uchicago.edu>
//
//
// code dated: June 7, 2010
//
//  cfa_linedn_RT.cc is free software: you can redistribute it and/or modify
//  it under the terms of the GNU General Public License as published by
//  the Free Software Foundation, either version 3 of the License, or
//  (at your option) any later version.
//
//  This program is distributed in the hope that it will be useful,
//  but WITHOUT ANY WARRANTY; without even the implied warranty of
//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
//  GNU General Public License for more details.
//
//  You should have received a copy of the GNU General Public License
//  along with this program.  If not, see <http://www.gnu.org/licenses/>.
//
////////////////////////////////////////////////////////////////

// adaptions for photivo
// width -> m_Width
// height -> m_Height
// image -> m_Image

#include "../ptDefines.h"

#define TS 512    // Tile size

/*
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <float.h>
#include <limits.h>
#include <setjmp.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>*/
#include <cmath>


//#include "shrtdct_float.c"


//#define SQR(x) ((x)*(x))
//#define MIN(a,b) ((a) < (b) ? (a) : (b))
//#define MAX(a,b) ((a) > (b) ? (a) : (b))
//#define LIM(x,min,max) MAX(min,MIN(x,max))
//#define ULIM(x,y,z) ((y) < (z) ? LIM(x,y,z) : LIM(x,z,y))
//#define CLIP(x) LIM(x,0,65535)

// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

void ddct8x8s(int isgn, float **a);

// %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

void CLASS cfa_linedn(float noise)
{
  // local variables
  uint16_t height = m_Height, width = m_Width;

  const float clip_pt = 0.8*1.0 / MIN(MIN(m_D65Multipliers[0],m_D65Multipliers[1]),m_D65Multipliers[2])* 65535.0;

  float eps=1e-5;     //tolerance to avoid dividing by zero

  const float gauss[5] = {0.20416368871516755, 0.18017382291138087, 0.1238315368057753, 0.0662822452863612, 0.02763055063889883};
  const float rolloff[8] = {0, 0.135335, 0.249352, 0.411112, 0.606531, 0.800737, 0.945959, 1}; //gaussian with sigma=3
  const float window[8] = {0, .25, .75, 1, 1, .75, .25, 0}; //sine squared


  float aarr[4][8][8], *bbrr[4][8], **dctblock[4];
  for (int j=0; j<4; j++) {
    for (int i = 0; i < 8; i++)
      bbrr[j][i] = aarr[j][i];
    dctblock[j] = bbrr[j];
  }

  // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
  float noisevar=SQR(3*noise*65535); // _noise_ (as a fraction of saturation) is input to the algorithm

#pragma omp parallel
  {

  float *cfain= new float[TS*TS];
  float *cfablur= new float[TS*TS];
  float *cfadiff= new float[TS*TS];
  float *cfadn= new float[TS*TS];

/*
  char      *buffer;      // TS*TS*16
  float         (*cfain);     // TS*TS*4
  float           (*cfablur);   // TS*TS*4
  float           (*cfadiff);   // TS*TS*4
  float           (*cfadn);     // TS*TS*4
  buffer = (char *) calloc((3*sizeof(float)+sizeof(int))*TS*TS,1);// 16
//  merror(buffer,"cfa_linedn()");
//  memset(buffer,0,16*TS*TS);
  cfain     = (float (*))   buffer; //pointers to rows of array
  cfablur     = (float (*))     (buffer +  sizeof(float)*TS*TS); //4
  cfadiff     = (float (*))     (buffer +  2*sizeof(float)*TS*TS);//8
  cfadn     = (float (*))     (buffer +  3*sizeof(float)*TS*TS);//12
*/
  // Main algorithm: Tile loop
#pragma omp for schedule(dynamic) nowait
  for (int top=0; top < height-16; top += TS-32)
    for (int left=0; left < width-16; left += TS-32) {

      int bottom = MIN( top+TS, (int32_t)height);
      int right  = MIN(left+TS, (int32_t)width);
      int numrows = bottom - top;
      int numcols = right - left;
      int indx1;
      // load CFA data; data should be in linear gamma space, before white balance multipliers are applied
      for (int rr=top; rr < top+numrows; rr++)
        for (int cc=left, indx=(rr-top)*TS; cc < left+numcols; cc++, indx++) {

          cfain[indx] = m_Image[rr*width+cc][FC(rr,cc)];
        }
      //pad the block to a multiple of 16 on both sides

      if (numcols < TS) {
        indx1=numcols % 16;
        for (int i=0; i<(16-indx1); i++)
          for (int rr=0; rr<numrows; rr++)

            cfain[(rr)*TS+numcols+i+1]=cfain[(rr)*TS+numcols-i];
        numcols += 16-indx1;
      }

      if (numrows < TS) {
        indx1=numrows % 16;
        for (int i=0; i<(16-indx1); i++)
          for (int cc=0; cc<numcols; cc++)

            cfain[(numrows+i+1)*TS+cc]=cfain[(numrows-i)*TS+cc];
        numrows += 16-indx1;
      }

      //The cleaning algorithm starts here


      //gaussian blur of CFA data
      for (int rr=8; rr < numrows-8; rr++)
        for (int indx=rr*TS; indx < rr*TS+numcols; indx++) {

          cfablur[indx]=gauss[0]*cfain[indx];
          for (int i=1; i<5; i++) {
            cfablur[indx] += gauss[i]*(cfain[indx-(2*i)*TS]+cfain[indx+(2*i)*TS]);
          }
        }
      for (int rr=8; rr < numrows-8; rr++)
        for (int indx=rr*TS+8; indx < rr*TS+numcols-8; indx++) {

          cfadn[indx] = gauss[0]*cfablur[indx];
          for (int i=1; i<5; i++) {
            cfadn[indx] += gauss[i]*(cfablur[indx-2*i]+cfablur[indx+2*i]);
          }
          cfadiff[indx]=cfain[indx]-cfadn[indx]; // hipass cfa data
        }

      //begin block DCT
      float linehvar[4], linevvar[4], noisefactor[4][8][2], coeffsq;
#pragma omp critical
    {
      for (int rr=8; rr < numrows-22; rr+=8) // (rr,cc) shift by 8 to overlap blocks
        for (int cc=8; cc < numcols-22; cc+=8) {

          for (int ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
            for (int ex=0; ex<2; ex++) {

              //grab an 8x8 block of a given RGGB channel
              for (int i=0; i<8; i++)
                for (int j=0; j<8; j++) {
                  dctblock[2*ey+ex][i][j]=cfadiff[(rr+2*i+ey)*TS+cc+2*j+ex];

                }

              ddct8x8s(-1, dctblock[2*ey+ex]); //forward DCT
            }
          for (int ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
            for (int ex=0; ex<2; ex++) {
              linehvar[2*ey+ex]=linevvar[2*ey+ex]=0;
              for (int i=4; i<8; i++) {
                linehvar[2*ey+ex] += SQR(dctblock[2*ey+ex][0][i]);
                linevvar[2*ey+ex] += SQR(dctblock[2*ey+ex][i][0]);

              }
              //Wiener filter for line denoising; roll off low frequencies
              for (int i=1; i<8; i++) {
                coeffsq = SQR(dctblock[2*ey+ex][i][0]);//vertical
                noisefactor[2*ey+ex][i][0] = coeffsq/(coeffsq+rolloff[i]*noisevar+eps);
                coeffsq = SQR(dctblock[2*ey+ex][0][i]);//horizontal
                noisefactor[2*ey+ex][i][1] = coeffsq/(coeffsq+rolloff[i]*noisevar+eps);
                // noisefactor labels are [RGGB subarray][row/col position][0=vert,1=hor]

                }
              }
          //horizontal lines
          if (4*noisevar>(linehvar[0]+linehvar[1])) {//horizontal lines
            for (int i=1; i<8; i++) {
              dctblock[0][0][i] *= 0.5*(noisefactor[0][i][1]+noisefactor[1][i][1]);//or should we use MIN???
              dctblock[1][0][i] *= 0.5*(noisefactor[0][i][1]+noisefactor[1][i][1]);//or should we use MIN???
            }
          }
          if (4*noisevar>(linehvar[2]+linehvar[3])) {//horizontal lines
            for (int i=1; i<8; i++) {
              dctblock[2][0][i] *= 0.5*(noisefactor[2][i][1]+noisefactor[3][i][1]);//or should we use MIN???
              dctblock[3][0][i] *= 0.5*(noisefactor[2][i][1]+noisefactor[3][i][1]);//or should we use MIN???
                }
              }

          //vertical lines
          if (4*noisevar>(linevvar[0]+linevvar[2])) {//vertical lines
            for (int i=1; i<8; i++) {
              dctblock[0][i][0] *= 0.5*(noisefactor[0][i][0]+noisefactor[2][i][0]);//or should we use MIN???
              dctblock[2][i][0] *= 0.5*(noisefactor[0][i][0]+noisefactor[2][i][0]);//or should we use MIN???
            }
          }
          if (4*noisevar>(linevvar[1]+linevvar[3])) {//vertical lines
            for (int i=1; i<8; i++) {
              dctblock[1][i][0] *= 0.5*(noisefactor[1][i][0]+noisefactor[3][i][0]);//or should we use MIN???
              dctblock[3][i][0] *= 0.5*(noisefactor[1][i][0]+noisefactor[3][i][0]);//or should we use MIN???
            }
          }

          for (int ey=0; ey<2; ey++) // (ex,ey) specify RGGB subarray
            for (int ex=0; ex<2; ex++) {
              ddct8x8s(1, dctblock[2*ey+ex]); //inverse DCT

              //multiply by window fn and add to output (cfadn)
              for (int i=0; i<8; i++)
                for (int j=0; j<8; j++) {
                  cfadn[(rr+2*i+ey)*TS+cc+2*j+ex] += window[i]*window[j]*dctblock[2*ey+ex][i][j];

                }
            }
      }
    }
      // %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
      // copy smoothed results back to image matrix
      for (int rr=16; rr < numrows-16; rr++) {
        int row = rr + top;
        for (int col=16+left, indx=rr*TS+16; indx < rr*TS+numcols-16; indx++, col++) {

          if (m_Image[row*width+col][FC(row,col)]<clip_pt && cfadn[indx]<clip_pt)
            m_Image[row*width+col][FC(row,col)] = CLIP((int32_t)(cfadn[indx]+ 0.5));
        }
      }
    }
  // clean up
  delete [] cfain;
  delete [] cfablur;
  delete [] cfadiff;
  delete [] cfadn;
  //free(buffer);
  }

}
#undef TS


//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
/*
 Discrete Cosine Transform Code

 Copyright(C) 1997 Takuya OOURA (email: ooura@mmm.t.u-tokyo.ac.jp).
 You may use, copy, modify this code for any purpose and
 without fee. You may distribute this ORIGINAL package.
 */


/*
 Short Discrete Cosine Transform
 data length :8x8
 method      :row-column, radix 4 FFT
 functions
 ddct8x8s  : 8x8 DCT
 function prototypes
 void ddct8x8s(int isgn, float **a);
 */


/*
 -------- 8x8 DCT (Discrete Cosine Transform) / Inverse of DCT --------
 [definition]
 <case1> Normalized 8x8 IDCT
 C[k1][k2] = (1/4) * sum_j1=0^7 sum_j2=0^7
 a[j1][j2] * s[j1] * s[j2] *
 cos(pi*j1*(k1+1/2)/8) *
 cos(pi*j2*(k2+1/2)/8), 0<=k1<8, 0<=k2<8
 (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
 <case2> Normalized 8x8 DCT
 C[k1][k2] = (1/4) * s[k1] * s[k2] * sum_j1=0^7 sum_j2=0^7
 a[j1][j2] *
 cos(pi*(j1+1/2)*k1/8) *
 cos(pi*(j2+1/2)*k2/8), 0<=k1<8, 0<=k2<8
 (s[0] = 1/sqrt(2), s[j] = 1, j > 0)
 [usage]
 <case1>
 ddct8x8s(1, a);
 <case2>
 ddct8x8s(-1, a);
 [parameters]
 a[0...7][0...7] :input/output data (double **)
 output data
 a[k1][k2] = C[k1][k2], 0<=k1<8, 0<=k2<8
 */


/* Cn_kR = sqrt(2.0/n) * cos(pi/2*k/n) */
/* Cn_kI = sqrt(2.0/n) * sin(pi/2*k/n) */
/* Wn_kR = cos(pi/2*k/n) */
/* Wn_kI = sin(pi/2*k/n) */
#define C8_1R   0.49039264020161522456
#define C8_1I   0.09754516100806413392
#define C8_2R   0.46193976625564337806
#define C8_2I   0.19134171618254488586
#define C8_3R   0.41573480615127261854
#define C8_3I   0.27778511650980111237
#define C8_4R   0.35355339059327376220
#define W8_4R   0.70710678118654752440


void CLASS ddct8x8s(int isgn, float **a)
{
    int j;
    float x0r, x0i, x1r, x1i, x2r, x2i, x3r, x3i;
    float xr, xi;

    if (isgn < 0) {
        for (j = 0; j <= 7; j++) {
            x0r = a[0][j] + a[7][j];
            x1r = a[0][j] - a[7][j];
            x0i = a[2][j] + a[5][j];
            x1i = a[2][j] - a[5][j];
            x2r = a[4][j] + a[3][j];
            x3r = a[4][j] - a[3][j];
            x2i = a[6][j] + a[1][j];
            x3i = a[6][j] - a[1][j];
            xr = x0r + x2r;
            xi = x0i + x2i;
            a[0][j] = C8_4R * (xr + xi);
            a[4][j] = C8_4R * (xr - xi);
            xr = x0r - x2r;
            xi = x0i - x2i;
            a[2][j] = C8_2R * xr - C8_2I * xi;
            a[6][j] = C8_2R * xi + C8_2I * xr;
            xr = W8_4R * (x1i - x3i);
            x1i = W8_4R * (x1i + x3i);
            x3i = x1i - x3r;
            x1i += x3r;
            x3r = x1r - xr;
            x1r += xr;
            a[1][j] = C8_1R * x1r - C8_1I * x1i;
            a[7][j] = C8_1R * x1i + C8_1I * x1r;
            a[3][j] = C8_3R * x3r - C8_3I * x3i;
            a[5][j] = C8_3R * x3i + C8_3I * x3r;
        }
        for (j = 0; j <= 7; j++) {
            x0r = a[j][0] + a[j][7];
            x1r = a[j][0] - a[j][7];
            x0i = a[j][2] + a[j][5];
            x1i = a[j][2] - a[j][5];
            x2r = a[j][4] + a[j][3];
            x3r = a[j][4] - a[j][3];
            x2i = a[j][6] + a[j][1];
            x3i = a[j][6] - a[j][1];
            xr = x0r + x2r;
            xi = x0i + x2i;
            a[j][0] = C8_4R * (xr + xi);
            a[j][4] = C8_4R * (xr - xi);
            xr = x0r - x2r;
            xi = x0i - x2i;
            a[j][2] = C8_2R * xr - C8_2I * xi;
            a[j][6] = C8_2R * xi + C8_2I * xr;
            xr = W8_4R * (x1i - x3i);
            x1i = W8_4R * (x1i + x3i);
            x3i = x1i - x3r;
            x1i += x3r;
            x3r = x1r - xr;
            x1r += xr;
            a[j][1] = C8_1R * x1r - C8_1I * x1i;
            a[j][7] = C8_1R * x1i + C8_1I * x1r;
            a[j][3] = C8_3R * x3r - C8_3I * x3i;
            a[j][5] = C8_3R * x3i + C8_3I * x3r;
        }
    } else {
        for (j = 0; j <= 7; j++) {
            x1r = C8_1R * a[1][j] + C8_1I * a[7][j];
            x1i = C8_1R * a[7][j] - C8_1I * a[1][j];
            x3r = C8_3R * a[3][j] + C8_3I * a[5][j];
            x3i = C8_3R * a[5][j] - C8_3I * a[3][j];
            xr = x1r - x3r;
            xi = x1i + x3i;
            x1r += x3r;
            x3i -= x1i;
            x1i = W8_4R * (xr + xi);
            x3r = W8_4R * (xr - xi);
            xr = C8_2R * a[2][j] + C8_2I * a[6][j];
            xi = C8_2R * a[6][j] - C8_2I * a[2][j];
            x0r = C8_4R * (a[0][j] + a[4][j]);
            x0i = C8_4R * (a[0][j] - a[4][j]);
            x2r = x0r - xr;
            x2i = x0i - xi;
            x0r += xr;
            x0i += xi;
            a[0][j] = x0r + x1r;
            a[7][j] = x0r - x1r;
            a[2][j] = x0i + x1i;
            a[5][j] = x0i - x1i;
            a[4][j] = x2r - x3i;
            a[3][j] = x2r + x3i;
            a[6][j] = x2i - x3r;
            a[1][j] = x2i + x3r;
        }
        for (j = 0; j <= 7; j++) {
            x1r = C8_1R * a[j][1] + C8_1I * a[j][7];
            x1i = C8_1R * a[j][7] - C8_1I * a[j][1];
            x3r = C8_3R * a[j][3] + C8_3I * a[j][5];
            x3i = C8_3R * a[j][5] - C8_3I * a[j][3];
            xr = x1r - x3r;
            xi = x1i + x3i;
            x1r += x3r;
            x3i -= x1i;
            x1i = W8_4R * (xr + xi);
            x3r = W8_4R * (xr - xi);
            xr = C8_2R * a[j][2] + C8_2I * a[j][6];
            xi = C8_2R * a[j][6] - C8_2I * a[j][2];
            x0r = C8_4R * (a[j][0] + a[j][4]);
            x0i = C8_4R * (a[j][0] - a[j][4]);
            x2r = x0r - xr;
            x2i = x0i - xi;
            x0r += xr;
            x0i += xi;
            a[j][0] = x0r + x1r;
            a[j][7] = x0r - x1r;
            a[j][2] = x0i + x1i;
            a[j][5] = x0i - x1i;
            a[j][4] = x2r - x3i;
            a[j][3] = x2r + x3i;
            a[j][6] = x2i - x3r;
            a[j][1] = x2i + x3r;
        }
    }
}

